{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "### 英文句子中的词性标注\n",
    "名词，动词等\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Vocabulary tables: ids are handed out in order of first appearance.\n",
    "tag2id, id2tag = {}, {}\n",
    "word2id, id2word = {}, {}\n",
    "# Each usable training line has the form word/TAG.\n",
    "# The with-block closes the file; a bare open() in the for-header never closed it explicitly.\n",
    "with open(\"../data/traindata.txt\") as train_file:\n",
    "    for line in train_file:\n",
    "        items = line.split(\"/\")\n",
    "        if len(items) < 2:\n",
    "            # Blank or malformed line: there is no tag to read, so skip it instead of raising IndexError.\n",
    "            continue\n",
    "        # items[1] still carries the trailing newline, so strip it from the tag.\n",
    "        word, tag = items[0], items[1].rstrip()\n",
    "        if word not in word2id:\n",
    "            word2id[word] = len(word2id)\n",
    "            id2word[len(id2word)] = word\n",
    "        if tag not in tag2id:\n",
    "            tag2id[tag] = len(tag2id)\n",
    "            id2tag[len(id2tag)] = tag\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [],
   "source": [
    "# Parameter tables of the HMM, all starting at zero.\n",
    "M, N = len(word2id), len(tag2id)    # number of distinct words, number of distinct tags\n",
    "\n",
    "pi = np.zeros(N)                # initial-tag table, indexed by tag id\n",
    "A = np.zeros(shape=(N, N))      # transition table, indexed [previous tag id][tag id]\n",
    "B = np.zeros(shape=(N, M))      # emission table, indexed [tag id][word id]"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [],
   "source": [
    "# Accumulate the raw counts behind pi, A and B.\n",
    "# Zero the tables first so that re-running this cell neither double-counts nor adds\n",
    "# counts on top of values that were already normalised.\n",
    "pi.fill(0)\n",
    "A.fill(0)\n",
    "B.fill(0)\n",
    "prev_tag = \"\"    # empty string: the next token is the first one of a sentence\n",
    "with open(\"../data/traindata.txt\") as train_file:\n",
    "    for line in train_file:\n",
    "        items = line.split(\"/\")\n",
    "        if len(items) < 2:\n",
    "            # Blank or malformed line: no tag to count.\n",
    "            continue\n",
    "        word, tag = items[0], items[1].rstrip()\n",
    "        # Look up the ids of the word and of its tag.\n",
    "        wordId, tagId = word2id[word], tag2id[tag]\n",
    "        # Emission count: how often this tag was seen with this word.\n",
    "        B[tagId][wordId] += 1\n",
    "        if prev_tag == \"\":\n",
    "            # Sentence-initial token feeds the initial-tag counts.\n",
    "            pi[tagId] += 1\n",
    "        else:\n",
    "            # Any other token feeds the transition counts.\n",
    "            A[tag2id[prev_tag]][tagId] += 1\n",
    "        # A \".\" token ends the sentence, so forget the previous tag after it.\n",
    "        prev_tag = \"\" if word == \".\" else tag"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [],
   "source": [
    "# Turn the raw counts into probabilities: pi and every row of A and B are divided by their sum.\n",
    "# A row of A is all zeros when its tag was never followed by another token inside a sentence;\n",
    "# dividing by that zero sum would fill the row with NaN, so all-zero rows are left untouched.\n",
    "pi_total = pi.sum()\n",
    "if pi_total > 0:\n",
    "    pi = pi / pi_total\n",
    "for i in range(N):\n",
    "    a_total, b_total = A[i].sum(), B[i].sum()\n",
    "    if a_total > 0:\n",
    "        A[i] /= a_total\n",
    "    if b_total > 0:\n",
    "        B[i] /= b_total"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [],
   "source": [
    "# print(f\"pi:{pi}\")\n",
    "def log(v):\n",
    "    if v == 0:\n",
    "        return np.log(v+0.000001)\n",
    "    return np.log(v)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [],
   "source": [
    "# 维特比算法\n",
    "def viterbi(x,pi,A, B):\n",
    "    x = [word2id[word] for word in x.split(\" \")]\n",
    "    T = len(x)\n",
    "    dp = np.zeros((T,N))    # 动态规划组\n",
    "    ptr = np.zeros((T,N),dtype=int)    # 保存最大路径对应得下标，也就是结果\n",
    "    for j in range(N):\n",
    "        # 对数相加等于真数相乘\n",
    "        # 等于初始概率x发射概率\n",
    "        dp[0][j] = log(pi[j]) + log(B[j][x[0]])\n",
    "    for i in range(1,T):\n",
    "        for j in range(N):\n",
    "            dp[i][j] = -9999999\n",
    "            for k in range(N):    # 这里时算上一个列\n",
    "                score = dp[i-1][k]+log(A[k][j])+log(B[j][x[i]])\n",
    "                if score > dp[i][j]:\n",
    "                    dp[i][j] = score\n",
    "                    ptr[i][j] = k\n",
    "    best_seq = [0]*T\n",
    "    print(best_seq)\n",
    "    best_seq[T-1] = np.argmax(dp[T-1])\n",
    "    for i in range(T-2,-1,-1):\n",
    "        best_seq[i] = ptr[i+1][best_seq[i+1]]\n",
    "    for i in range(len(best_seq)):\n",
    "        print(id2tag[best_seq[i]])"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "[0, 0, 0, 0, 0]\n",
      "PRP\n",
      "VBP\n",
      "TO\n",
      "VB\n",
      "NN\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "# Tag one example sentence with the tables estimated above.\n",
    "x = \"I like to play ball\"\n",
    "viterbi(x, pi=pi, A=A, B=B)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "pycharm-b7e0bb6f",
   "language": "python",
   "display_name": "PyCharm (Reading_Comprehension)"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}